library(gridExtra)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## âś” ggplot2 3.4.0 âś” purrr 0.3.5
## âś” tibble 3.1.8 âś” dplyr 1.0.10
## âś” tidyr 1.2.1 âś” stringr 1.5.0
## âś” readr 2.1.3 âś” forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## âś– dplyr::combine() masks gridExtra::combine()
## âś– dplyr::filter() masks stats::filter()
## âś– dplyr::lag() masks stats::lag()
library(dplyr)
library(data.table)
##
## Attaching package: 'data.table'
##
## The following objects are masked from 'package:dplyr':
##
## between, first, last
##
## The following object is masked from 'package:purrr':
##
## transpose
library(ggplot2)
library(reshape2)
##
## Attaching package: 'reshape2'
##
## The following objects are masked from 'package:data.table':
##
## dcast, melt
##
## The following object is masked from 'package:tidyr':
##
## smiths
library(rsample)
library(recommenderlab)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Loading required package: arules
##
## Attaching package: 'arules'
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following objects are masked from 'package:base':
##
## abbreviate, write
##
## Loading required package: proxy
##
## Attaching package: 'proxy'
##
## The following object is masked from 'package:Matrix':
##
## as.matrix
##
## The following objects are masked from 'package:stats':
##
## as.dist, dist
##
## The following object is masked from 'package:base':
##
## as.matrix
##
## Loading required package: registry
## Registered S3 methods overwritten by 'registry':
## method from
## print.registry_field proxy
## print.registry_entry proxy
library(ggpubr)
library(lsa)
## Loading required package: SnowballC
library(SnowballC)
data(MovieLense)
mx_user_film <- as(MovieLense, "matrix") # convert realratingmatrix to normal matrix
df_user_film <- as.data.frame(mx_user_film) # convert matrix to dataframe form
df_film_user <- as.data.frame(t(mx_user_film)) # transpose the dataframe: each row is a movie name, each column is a user
df_11 <- df_film_user %>% mutate(cnt = rowSums(!is.na(df_film_user))) %>% arrange(desc(cnt)) %>% filter(cnt == max(cnt)) %>% select('cnt')
print(paste("The most frequently watched film is ", rownames(df_11)))
## [1] "The most frequently watched film is Star Wars (1977)"
df_unlist <- data.frame(rating=unlist(df_film_user)) # unlist the dataframe
ggplot(df_unlist,aes(rating)) + geom_histogram(binwidth=1) + # die Verteilung der Kundenratings gesamthaft
labs(x="Ratings", y="Count",title="Distribution of the user ratings") +
theme(plot.title = element_text(hjust = 0.5))
## Warning: Removed 1469760 rows containing non-finite values (`stat_bin()`).

mx_film_genre <- as.data.frame(MovieLenseMeta)
rownames(mx_film_genre) <- mx_film_genre$title
mx_film_genre <- as.matrix(mx_film_genre[,5:22]) # Movie Genre Matrix
mx_user_film[is.na(mx_user_film)] <- 0
mx_user_genre <- mx_user_film %*% mx_film_genre
mx_genre_user <- as.data.frame(t(mx_user_genre)) # a: Stärke Genre Kombination vollständig
mx_genre_user$summe <- rowSums(mx_genre_user) # new column "summe": summe user ratings of each genre
mx_genre_user <- cbind(genre = rownames(mx_genre_user), mx_genre_user)# new column "genre": genre name copied from rownames
df_12 <- as.data.frame(t(mx_film_genre)) %>% mutate(cnt = rowSums(as.data.frame(t(mx_film_genre))))%>% arrange(desc(cnt)) # add new column: count for each genres
df_12 <- cbind(genres = rownames(df_12),df_12) # index as a column
rownames(df_12) <- 1:nrow(df_12) # generate new index
g1 = ggplot(df_12,aes(x = (reorder(genres,cnt)), y = cnt)) + geom_col() + coord_flip() +
labs(y="Number of Views", x="Genres",title="Distribution by genres") +
theme(plot.title = element_text(hjust = 0.5))
g2 = ggplot(mx_genre_user,aes(x=reorder(genre,summe),y=summe)) + geom_col() + coord_flip()+ labs( x="Genre",y= "summed ratings of all users",title="Distribution by genre") +
theme(plot.title = element_text(hjust = 0.5))
mx_genre_user <- mx_genre_user %>% select(-genre)
grid.arrange(g1, g2, nrow = 1)
df_avg_rating_film <- df_film_user %>% mutate(avg_rating = rowMeans(df_film_user,na.rm = TRUE, dims = 1)) %>% select('avg_rating')
g1 = ggplot(df_avg_rating_film,aes(avg_rating)) + geom_histogram(bins = 5,fill="black", col="grey") +
# die Verteilung
labs(x="average ratings", y="Count of films",title="Distribution of avg-ratings per film") +
theme(plot.title = element_text(hjust = 0.5))
df_avg_rating_user <- df_user_film %>% mutate(avg_rating = rowMeans(df_user_film,na.rm = TRUE, dims = 1)) %>% select('avg_rating')
g2 = ggplot(df_avg_rating_user,aes(avg_rating)) + geom_histogram(bins = 5,fill="black", col="grey") +
# die Verteilung
labs(x="average ratings", y="Count of users",title="Distribution of avg-ratings per user")+
theme(plot.title = element_text(hjust = 0.5))
grid.arrange(g1, g2, nrow = 1)
left plot: Here I calculated the average rating of each film and counted them. The most frequent average ratings are 3 and 4. Together, at least 75% of the movies have average ratings of 3 or 4. Very few films become an average rating of 5. Reasons could be either the films are only watched and rated by a small group and this group of users really love them, or the films are simply good made that all the viewers have given the best ratings.
right plot: here I calculated the average ratings by user to check the user rating habits. The most of the users have the average ratings around 3.3 and 4.2. The plot shows us that the users have very differently rating preferences.
dev.new(width=8, height=2, unit="in")
df_avg_rating_user <- df_user_film %>% mutate(avg_rating = rowMeans(df_user_film,na.rm = TRUE, dims = 1)) %>% select('avg_rating')
g1 = ggplot(df_avg_rating_user,aes(avg_rating)) + geom_histogram(bins = 5,fill="black", col="grey") +
# die Verteilung
labs(x="average ratings per user", y="Count of users",title="Non-normalized ratings per user")+
theme(plot.title = element_text(hjust = 0.5))
normalized_movielens <- as(normalize(MovieLense,method = "z-score"), "matrix")
normalized_movielens <- as.data.frame(normalized_movielens)
normalized_avg_rating_user <- normalized_movielens %>% mutate(avg_rating=rowMeans(normalized_movielens,na.rm=TRUE, dims=1)) %>% select('avg_rating')
g2 = ggplot(normalized_avg_rating_user,aes(avg_rating)) + geom_histogram(bins = 5,fill="black", col="grey") + # die Verteilung
labs(x="average normalized ratings per user", y="Count of users",title="z-score Normalized ratings per user") +
theme(plot.title = element_text(hjust = 0.5))
grid.arrange(g1, g2, nrow = 1)
df_reduced <- df_film_user %>% mutate(n_pro_film = rowSums(!is.na(df_film_user))) %>% arrange(desc(n_pro_film)) %>% slice(0:700)%>% select(-n_pro_film) # reduce to 700 movies
df_reduced <- as.data.frame(t(df_reduced))
df_reduced <- df_reduced %>% mutate(n_pro_user = rowSums(!is.na(df_reduced))) %>% arrange(desc(n_pro_user)) %>% slice(0:400) %>% select(-n_pro_user) # reduce to 400 users
print(dim(df_user_film)); print(dim(df_reduced))
## [1] 943 1664
## [1] 400 700
print(paste0('Before data reduction, NA data proportion is ',sum(is.na(df_user_film))/(1664*943)))
## [1] "Before data reduction, NA data proportion is 0.936658781303532"
print(paste0('After data reduction, NA data proportion is ',sum(is.na(df_reduced))/(700*400)))
## [1] "After data reduction, NA data proportion is 0.758992857142857"
df_film_cnt_vor_red <- df_film_user %>% mutate(cnt = rowSums(!is.na(df_film_user)),id=1:1664) %>% select(cnt,id)%>% arrange(desc(cnt))
df_film_cnt_nach_red <- as.data.frame(t(df_reduced)) %>% mutate(cnt = rowSums(!is.na(as.data.frame(t(df_reduced)))),id=1:700) %>% select(cnt,id) %>% arrange(desc(cnt))
df_user_cnt_vor_red <- df_user_film %>% mutate(cnt = rowSums(!is.na(df_user_film)),id=1:943) %>% select(cnt,id)%>% arrange(desc(cnt))
df_user_cnt_nach_red <- df_reduced %>% mutate(cnt = rowSums(!is.na(df_reduced)),id=1:400) %>% select(cnt,id) %>% arrange(desc(cnt))
g1 = ggplot(df_film_cnt_vor_red, aes(reorder(id,-cnt),cnt))+geom_col(fill="black", col="black")+
labs(x = "Film", y = "number of ratings", title = "film popularity before reduction") +
theme(plot.title = element_text(hjust = 0.5),axis.ticks.x=element_blank(),axis.text.x = element_blank())
g2 = ggplot(df_film_cnt_nach_red, aes(reorder(id,-cnt),cnt))+geom_col(fill="black", col="black")+
labs(x = "Film", y = "number of ratings", title = "film popularity after reduction") +
theme(plot.title = element_text(hjust = 0.5),axis.ticks.x=element_blank(),axis.text.x = element_blank())
g3 = ggplot(df_user_cnt_vor_red, aes(reorder(id,-cnt),cnt))+geom_col(fill="black", col="black")+
labs(x = "User", y = "number of ratings", title = "User rating frequency before reduction") +
theme(plot.title = element_text(hjust = 0.5),axis.ticks.x=element_blank(),axis.text.x = element_blank())
g4 = ggplot(df_film_cnt_nach_red, aes(reorder(id,-cnt),cnt))+geom_col(fill="black", col="black")+
labs(x = "User", y = "number of ratings", title = "User rating frequency after reduction") +
theme(plot.title = element_text(hjust = 0.5),axis.ticks.x=element_blank(),axis.text.x = element_blank())
grid.arrange(g1,g2,g3,g4,nrow=2,ncol=2)
par(mfrow=c(1,2))
image(as(df_user_film,"matrix"), main = "sparsity before reduction")
image(as(df_reduced,"matrix"),main = "sparsity after reduction")
par(mfrow=c(1,2))
df_avg_rating <- df_film_user %>% mutate(avg_rating = rowMeans(df_film_user,na.rm = TRUE, dims = 1)) %>% select('avg_rating')
g1=ggplot(df_avg_rating,aes(avg_rating)) + geom_histogram(bins = 5,fill="black", col="grey") + # die Verteilung
labs(x="average ratings per film", y="Count",title="Before reduction") +
theme(plot.title = element_text(hjust = 0.5))
df_reduced_t <- as.data.frame(t(df_reduced))
df_reduced_avg_rating <- df_reduced_t%>% mutate(avg_rating = rowMeans(df_reduced_t,na.rm = TRUE, dims = 1)) %>% select('avg_rating')
g2=ggplot(df_reduced_avg_rating,aes(avg_rating)) + geom_histogram(bins = 5,fill="black", col="grey") + # die Verteilung
labs(x="average ratings per film", y="Count",title="After reduction") +
theme(plot.title = element_text(hjust = 0.5))
grid.arrange(g1,g2,nrow=1)
set.seed(1965)
mx_reduced <- as.matrix(df_reduced)
rrm_reduced <- as(mx_reduced,"realRatingMatrix")
train_test <- evaluationScheme(rrm_reduced, method="split", train=0.8, k=1, given=15, goodRating=4)
# training data 80% of the users
rrm_reduced_train <- getData(train_test,"train")
# test data is 20% of the all users, the test data is splited into two parts: known test data and unknown test data
# the known portion returns specified 50 film ratings per test user which will be used to predict ratings or films for the test users
rrm_reduced_known <- getData(train_test,"known")
# the rest of film ratings of the test users are in unknown portion, which will be used to evaluate the prediction
rrm_reduced_unknown <- getData(train_test,"unknown")
dim(rrm_reduced_train);dim(rrm_reduced_known);dim(rrm_reduced_unknown)
## [1] 320 700
## [1] 80 700
## [1] 80 700
# understand known and unknown test data
sum(!is.na(t(as.data.frame(as(rrm_reduced_known,'matrix'))[2,]))) # second user, known data
## [1] 15
sum(!is.na(t(as.data.frame(as(rrm_reduced_unknown,'matrix'))[2,]))) # second user, unknown data
## [1] 379
sum(!is.na(t(as.data.frame(as(rrm_reduced_known,'matrix'))[11,]))) # 11th user, known data
## [1] 15
sum(!is.na(t(as.data.frame(as(rrm_reduced_unknown,'matrix'))[11,]))) # 11th user, unknown data
## [1] 223
For example, the second user in test set has 15 film ratings in known set, 379 film ratings in unknown set. The intersection of known and unknow set is null.
The 11th user in test set has 15 film ratings in known set, 223 film ratings in unknown set.
model_IBCF <- Recommender(data = rrm_reduced_train,method="IBCF",parameter=list(normalize = "Z-score",method="Cosine",k=30))
welche bei IBCF für paarweise Ähnlichkeitsvergleiche verwendet werden.
Determine the distribution of films used in IBCF for pairwise similarity comparisons
# Here only exhibit the first 50 rows and columns
get_model_IBCF <- getModel(model_IBCF)
par(mfrow=c(1,2))
g1 = image(get_model_IBCF$sim, main = "Similarity matrix")
g2 = image(get_model_IBCF$sim[1:50, 1:50], main = "Similarity matrix of first 50 rows and cols")
grid.arrange(g1,g2,nrow=1,ncol=2)
IBCF_sim <- as.data.frame(colSums(get_model_IBCF$sim > 0))
colnames(IBCF_sim) <- "recommended_frequency" # frequency that the corresponding film is included in other films' TOP-N lists
ggplot(IBCF_sim, aes(x=IBCF_sim$recommended_frequency))+geom_histogram(fill="black", col="grey",binwidth = 5)+
labs(x = "Recommended frequency", y = "Count", title = "Distribution of recommended frequency per film") +
theme(plot.title = element_text(hjust = 0.5))
The plot shows the distribution of recommended frequency of films. For instance, about 52 films have the recommended frequency of 0, this means they are not included in any TOP recommending lists of other films. About 100 films are included in the TOP lists of 5 films. The highest frequency is about 160, but only a small amount of films have so high recommended frequency. . #### 3.4 Bestimme die Filme,
die am häufigsten in der Cosine-Ähnlichkeitsmatrix auftauchen und analysiere deren Vorkommen und Ratings im reduzierten Datensatz.
die am häufigsten Film in der Cosine-Ähnlichkeitsmatrix
# die 10 am häufigsten Filme in der Cosine Ähnlichkeitsmatrix, i.e. die Filme mit der höheste sum Ähnlichkeits
high_freq_film <- IBCF_sim %>% mutate(film = rownames(IBCF_sim)) %>% arrange(desc(recommended_frequency)) %>% slice(0:10) %>% select(film,recommended_frequency)
rownames(high_freq_film) <- 1:10
t <- df_reduced_t %>% mutate(is_NA = rowSums(is.na(df_reduced_t)),not_NA = rowSums(!is.na(df_reduced_t)),occurrence = rowSums(!is.na(df_reduced_t))/dim(df_reduced_t)[2], film = rownames(df_reduced_t)) %>% select(is_NA,not_NA,occurrence,film)
Occurrence <- left_join(high_freq_film,t,by = "film") %>% select(film,recommended_frequency,occurrence)
t2 <- df_reduced_t %>% mutate(film = rownames(df_reduced_t), avg_rating = rowMeans(df_reduced_t,na.rm=TRUE))%>% select(film,avg_rating)
high_freq_film <- left_join(Occurrence,t2,by = "film")
rownames(high_freq_film) <- 1:10
high_freq_film # occurrence: not_NA / user number, the user ratio that rated this film
g1 = ggplot(high_freq_film, aes(x= reorder(film,recommended_frequency), y=recommended_frequency)) + geom_bar(stat="identity")+coord_flip() +
labs(y = "Recommended frequency", x = "Films")
g2 = ggplot(high_freq_film, aes(x= occurrence, y=reorder(film,recommended_frequency))) + geom_bar(stat="identity")+
labs(x = "occurrence", y = "Films")
g3 = ggplot(high_freq_film, aes(x= avg_rating, y=reorder(film,recommended_frequency))) + geom_bar(stat="identity")+
labs(x = "average ratings", y = "Films")
ggarrange(g1, g2, g3,labels = c("A", "B", "C"),ncol = 1, nrow = 3)
The three plots present frequency, occurrencem average ratings of the ten most often recommended films, sorted by frequency.
recommended_frequency: the frequency that this item appears in the top-n recommendation list of other users.
occurrence: frequency that the film is rated divided by number of all users
avg_rating: the average rating of each film
From the result, we could see that, the most often recommended film Mouse Hunt has a relatively low average rating 2.32, and medium occurrence. This means there are other factors influence the recommend results.
um (a) für ordinale Ratings effizient die Cosine Similarity und (b) für binäre Ratings effizient die Jaccard Similarity zu berechnen #### 4.1a Implementiere eine Funktion,
um fĂĽr ordinale Ratings effizient die Cosine Similarity zu berechnen,
write a function cos_similarity to calculate the film cosine similarity with ordinal ratings
cos_similarity <- function(mx){
n <- dim(mx)[2]
mx_0 <- mx
mx_0[is.na(mx_0)] <- 0
sim_mx <- matrix(1:n^2, nrow = n)
for(i in 1:n){
for(j in 1:n){
numerator <- t(mx_0[,i]) %*% mx_0[,j]
denominator <- sqrt(sum(mx_0[,i]^2))*sqrt(sum(mx_0[,j]^2))
sim_mx[i,j] <- numerator/denominator
}
}
rownames(sim_mx) <- colnames(mx)
colnames(sim_mx) <- colnames(mx)
return(sim_mx)
}
# randomly select 100 films
set.seed(1)
df_100 <- df_reduced[,sample(1:ncol(df_reduced),100)]
cos_sim_reduced_1 <- cos_similarity(as(df_100,'matrix'))
as.data.frame(cos_sim_reduced_1)
rownames(cos_sim_reduced_1)<-NULL
colnames(cos_sim_reduced_1) <- NULL
heatmap(cos_sim_reduced_1,Colv = NA, Rowv = NA,main = "cosine similarity by my function")
um für binäre Ratings effizient die Jaccard Similarity zu berechnen
write a function to calculate the jaccard similarity of binary ratings
Jacc_similarity <- function(mx){
mx_bi <- mx
mx_bi[mx_bi <= 3] <- 0
mx_bi[is.na(mx_bi)] <- 0 # the NA and ratings <= 3 all converted as 0
mx_bi[mx_bi > 3] <- 1 # the ratings > 3 (which shows a preference) converted as 1
n <- dim(mx_bi)[2]
sim_mx <- matrix(1:n^2, nrow = n) # create a matrix with dimention of n x n for similarity
for(i in 1:n){
for(j in 1:n){
list1 = mx_bi[,i]
list2 = mx_bi[,j]
uni = 0 # union of film i and j
ins = 0 # intersection of film i and j
for(m in 1:dim(mx_bi)[1]){
if (list1[m] == list2[m] & (list1[m]==0|list1[m]==1)){
ins = ins + 1
uni = uni + 1}
if (list1[m] != list2[m] & !is.na(list1[m]) & !is.na(list2[m])){
uni = uni + 1}
}
sim_mx[i,j] <- ins/uni # intersection/union
}}
rownames(sim_mx) <- colnames(mx)
colnames(sim_mx) <- colnames(mx)
return(sim_mx)
}
Jacc_sim_reduced <- Jacc_similarity(as(df_100,'matrix')) # a 700 x 700 similarity matrix
as.data.frame(Jacc_sim_reduced)
rownames(Jacc_sim_reduced)<-NULL
colnames(Jacc_sim_reduced) <- NULL
heatmap(Jacc_sim_reduced,Colv = NA, Rowv = NA,main = "Jaccard similarity by my function")
mx_100_0 <- as(df_100,'matrix')
mx_100_0[is.na(mx_100_0)]<-0 # replace NA as 0
# calculate the cosine similarity matrix by open source package lsa
cos_sim_reduced_2 <- cosine(mx_100_0)
as.data.frame(cos_sim_reduced_2) # display a small part of the result
rownames(cos_sim_reduced_2)<-NULL
colnames(cos_sim_reduced_2) <- NULL
heatmap(cos_sim_reduced_2,Colv = NA, Rowv = NA,main = "cosine similarity by library lsa")
# compare the results by my function and the function from lsa library
compare_two_cos_sim_methods <- all.equal(cos_sim_reduced_1, cos_sim_reduced_2, tolerance = 1e-10,check.attributes = FALSE)
compare_two_cos_sim_methods
## [1] TRUE
cos_unlist <- data.frame(cos_sim=unlist(as.data.frame(cos_sim_reduced_1))) # unlist the dataframe
g1 = ggplot(cos_unlist,aes(cos_sim)) + geom_histogram(bins=50) + # die Verteilung der Kundenratings gesamthaft
labs(x="cosine similarity", y="Count",title="Distribution of cosine similarities") +
theme(plot.title = element_text(hjust = 0.5))
jac_unlist <- data.frame(jac_sim=unlist(as.data.frame(Jacc_sim_reduced))) # unlist the dataframe
g2 = ggplot(jac_unlist,aes(jac_sim)) + geom_histogram(bins=50) + # die Verteilung der Kundenratings gesamthaft
labs(x="jaccard similarity", y="Count",title="Distribution of jaccard similarities") +
theme(plot.title = element_text(hjust = 0.5))
grid.arrange(g1,g2,nrow=1,ncol=2)
compare_cos_Jacc <- all.equal(cos_sim_reduced_1,Jacc_sim_reduced,tolerance = 1e-3,check.attributes = FALSE)
print(paste0("differences between cosine and jaccard similarity is: ",compare_cos_Jacc))
## [1] "differences between cosine and jaccard similarity is: Mean relative difference: 2.173"
print(paste0("mean of cosine similarities is: ", mean(cos_unlist$cos_sim)))
## [1] "mean of cosine similarities is: 0.252113218211702"
print(paste0("mean of jaccard similarities is: ", mean(jac_unlist$jac_sim)))
## [1] "mean of jaccard similarities is: 0.7840795"
## top-N recommendations for testdata users with IBCF
Pred_IBCF <- predict(object = model_IBCF, newdata = rrm_reduced_known, n = 15,type=c("topNList"))
TOP15_IBCF <- sapply(Pred_IBCF@items, function(x) {colnames(df_reduced)[x]})
colnames(TOP15_IBCF) <- rownames(as(rrm_reduced_known,"matrix"))
as.data.frame(TOP15_IBCF) # each column represents the top 15 recommendations for one user, column names are UserID
# predict with UBCF
model_UBCF <- Recommender(rrm_reduced_train,method="UBCF",param=list(normalize = "Z-score",method="Cosine",nn=30)) #model
# top-N recommendations for testdata users with UBCF
Pred_UBCF <- predict(object = model_UBCF, newdata = rrm_reduced_known, n = 15,type=c("topNList"))
TOP15_UBCF <- sapply(Pred_UBCF@items, function(x) {colnames(df_reduced)[x]})
colnames(TOP15_UBCF) <- rownames(as(rrm_reduced_known,"matrix"))
as.data.frame(TOP15_UBCF) # each column represents the top 15 recommendations for one test user
Vergleiche die Top 15 Empfehlungen und deren Verteilung und diskutiere Gemeinsamkeiten und Unterschiede zwischen IBCF und UBCF fĂĽr alle Testkunden.
IBCF and UBCF models give completely different Top-15 recommendations for the same user.
First identify the most recommended movies in the TOP-15 list from all test users.
# generate frequency tables : all the recommendation films with the corresponding frequencies
film_freq_IBCF <- as.data.frame(table(as.factor(TOP15_IBCF)))
colnames(film_freq_IBCF) <- c("Film_by_IBCF", "Frequency")
film_freq_UBCF <- as.data.frame(table(as.factor(TOP15_UBCF)))
colnames(film_freq_UBCF) <- c("Film_by_UBCF", "Frequency")
film_freq_IBCF %>% arrange(desc(Frequency))
film_freq_UBCF %>% arrange(desc(Frequency))
g1 = ggplot(head(film_freq_IBCF %>% arrange(desc(Frequency)),15),aes(x = reorder(Film_by_IBCF,Frequency), y = Frequency)) + geom_col() + coord_flip() +
labs(y="Frequency", x="Films",title="IBCF Top-15 films for all users") +
theme(plot.title = element_text(hjust = 0.5))
g2 = ggplot(head(film_freq_UBCF %>% arrange(desc(Frequency)),15),aes(x = reorder(Film_by_UBCF,Frequency), y = Frequency)) + geom_col() + coord_flip() +
labs(y="Frequency", x="Films",title="UBCF Top-15 films for all users") +
theme(plot.title = element_text(hjust = 0.5))
grid.arrange(g1,g2,nrow=2,ncol=1)
# the test user "unknown" ratings
mx_reduced_unknown <- as(rrm_reduced_unknown,"matrix")
# evaluate recommendations on "unknown" ratings
acc_IB <- calcPredictionAccuracy(predict(object = model_IBCF, newdata = rrm_reduced_known, n = 15,type="topNList"),rrm_reduced_unknown,given=15,goodRating=4)
acc_UB <- calcPredictionAccuracy(predict(object = model_UBCF, newdata = rrm_reduced_known, n = 15,type="topNList"),rrm_reduced_unknown,given=15,goodRating=4)
acc_ordinal <- rbind(acc_IB,acc_UB)
rownames(acc_ordinal) <- c("IBCF ordinal","UBCF ordinal")
acc_ordinal <- as.data.frame(acc_ordinal) %>% mutate(f1_score = 2*precision*recall/(precision + recall))
as(acc_ordinal,'matrix')
## TP FP FN TN N precision recall
## IBCF ordinal 6.1625 8.8375 76.4000 593.6000 685 0.4108333 0.08378385
## UBCF ordinal 1.8750 13.1250 80.6875 589.3125 685 0.1250000 0.02345749
## TPR FPR f1_score
## IBCF ordinal 0.08378385 0.01446599 0.13918319
## UBCF ordinal 0.02345749 0.02181942 0.03950203
# convert the reduced dataset to binary: ratings > 3 converted as 1 (like), ratings <= 3 converted as 0 (not like)
df_reduced_bi <- df_reduced
df_reduced_bi[df_reduced_bi <= 2] <- 0
df_reduced_bi[df_reduced_bi > 2] <- 1
set.seed(468)
mx_reduced_bi <- as.matrix(df_reduced_bi)
rrm_reduced_bi <- as(mx_reduced_bi,"realRatingMatrix")
train_test_bi <- evaluationScheme(rrm_reduced_bi, method="split", train=0.8, k=1, given=15)
# training data 80% of the users
rrm_reduced_train_bi <- getData(train_test_bi,"train")
# test data is 20% of the all users, the test data is splited into two parts: known test data and unknown test data
# the known portion returns specified 20 items per test user is used to predict ratings or films for the test users
rrm_reduced_known_bi <- getData(train_test_bi,"known")
# the unknown portion is used to compute the prediction error of the model
rrm_reduced_unknown_bi <- getData(train_test_bi,"unknown")
# train the IBCF or UBCF model on training dataset
model_IBCF_bi <- Recommender(data = rrm_reduced_train_bi,method="IBCF",parameter=list(normalize = "Z-score",method="Jaccard",k=30))
model_UBCF_bi <- Recommender(data = rrm_reduced_train_bi,method="UBCF",parameter=list(normalize = "Z-score",method="Jaccard",nn=30))
# predict the ratings of test users by IBCF and UBCF
pred_rating_IBCF_bi <- as(predict(object = model_IBCF_bi, newdata = rrm_reduced_known_bi, n = 15,type="ratings"),"matrix")
pred_rating_UBCF_bi <- as(predict(object = model_UBCF_bi, newdata = rrm_reduced_known_bi, n = 15,type="ratings"),"matrix")
# the test user "unknown" ratings
mx_reduced_unknown_bi <- as(rrm_reduced_unknown_bi,"matrix")
# evaluate recommendations on "unknown" ratings
acc_IB_bi <- calcPredictionAccuracy(predict(object = model_IBCF_bi, newdata = rrm_reduced_known_bi, n = 15,type="topNList"),rrm_reduced_unknown_bi,given=15,goodRating=1)
acc_UB_bi <- calcPredictionAccuracy(predict(object = model_UBCF_bi, newdata = rrm_reduced_known_bi, n = 15,type="topNList"),rrm_reduced_unknown_bi,given=15,goodRating=1)
acc_bi <- rbind(acc_IB_bi,acc_UB_bi)
rownames(acc_bi) <- c("IBCF binary","UBCF binary")
acc_bi <- as.data.frame(acc_bi) %>% mutate(f1_score = 2*precision*recall/(precision + recall))%>% select(-c(TP,FP,FN,TN,N,TPR,FPR))
as(acc_bi,'matrix')
## precision recall f1_score
## IBCF binary 0.4031746 0.01270659 0.02463672
## UBCF binary 0.1916667 0.02441571 0.04331382
acc_ubcf <- as(rbind(acc_UB,acc_UB_bi),'matrix')
acc_ubcf <- as.data.frame(acc_ubcf) %>% mutate(f1_score = 2*precision*recall/(precision + recall))%>% select(-c(TP,FP,FN,TN,N,TPR,FPR))
rownames(acc_ubcf) <- c("UBCF ordinal cosine","UBCF binary jaccard")
as(acc_ubcf,'matrix')
## precision recall f1_score
## UBCF ordinal cosine 0.1250000 0.02345749 0.03950203
## UBCF binary jaccard 0.1916667 0.02441571 0.04331382
# SVD MODEL
model_SVD_10 <- Recommender(data = rrm_reduced_train,method="SVD",parameter=list(normalize = "Z-score",k=10))
model_SVD_20 <- Recommender(data = rrm_reduced_train,method="SVD",parameter=list(normalize = "Z-score",k=20))
model_SVD_30 <- Recommender(data = rrm_reduced_train,method="SVD",parameter=list(normalize = "Z-score",k=30))
model_SVD_40 <- Recommender(data = rrm_reduced_train,method="SVD",parameter=list(normalize = "Z-score",k=40))
model_SVD_50 <- Recommender(data = rrm_reduced_train,method="SVD",parameter=list(normalize = "Z-score",k=50))
# evaluate recommendations on "unknown" ratings
acc_SVD_10 <- calcPredictionAccuracy(predict(object = model_SVD_10, newdata = rrm_reduced_known, n = 15,type="topNList"),rrm_reduced_unknown,given=15,goodRating = 4)
acc_SVD_20 <- calcPredictionAccuracy(predict(object = model_SVD_20, newdata = rrm_reduced_known, n = 15,type="topNList"),rrm_reduced_unknown,given=15,goodRating = 4)
acc_SVD_30 <- calcPredictionAccuracy(predict(object = model_SVD_30, newdata = rrm_reduced_known, n = 15,type="topNList"),rrm_reduced_unknown,given=15,goodRating = 4)
acc_SVD_40 <- calcPredictionAccuracy(predict(object = model_SVD_40, newdata = rrm_reduced_known, n = 15,type="topNList"),rrm_reduced_unknown,given=15,goodRating = 4)
acc_SVD_50 <- calcPredictionAccuracy(predict(object = model_SVD_50, newdata = rrm_reduced_known, n = 15,type="topNList"),rrm_reduced_unknown,given=15,goodRating = 4)
acc_IBCF_top15 <- calcPredictionAccuracy(predict(object = model_IBCF, newdata = rrm_reduced_known, n = 15,type="topNList"),rrm_reduced_unknown,given=15,goodRating = 4)
acc_SVD_IB <- rbind(acc_SVD_10,acc_SVD_20,acc_SVD_30,acc_SVD_40,acc_SVD_50,acc_IBCF_top15)
rownames(acc_SVD_IB) <- c("SVD_k_10","SVD_k_20","SVD_k_30","SVD_k_40","SVD_k_50","IBCF_cos_k_30")
#acc_SVD_IB <- as.data.frame(acc_SVD_IB) %>% mutate(.,models = c("SVD_10","SVD_20","SVD_30","SVD_40","SVD_50","IBCF_cos_30"))
acc_SVD_IB <- as.data.frame(acc_SVD_IB) %>% mutate(f1_score = 2*precision*recall/(precision + recall))%>% select(-c(TP,FP,FN,TN,N,TPR,FPR))
acc_SVD_IB$model <- c("SVD_10","SVD_20","SVD_30","SVD_40","SVD_50","IBCF_cos_30")
acc_SVD_IB
metrics_long <- acc_SVD_IB %>% pivot_longer(names_to = "metrics", values_to = "Value", precision:f1_score)
ggplot(metrics_long, aes(model, Value, fill = metrics)) +
geom_col(position = "dodge")+
labs(title="Compare SVD and IBCF models") +
theme(plot.title = element_text(hjust = 0.5))
calc_topn_metrics <- function(mx,split_meth,split_ratio,split_k,nr,meth,para){
# mx: U_I data; split_meth: 'cross' or 'split'; split_ratio: 0.8 for example; split_k: an integer, 1 for split, other numbers for cross-validation ; nr: Top-N; meth: model method,e.g. "IBCF"; para:parameter for models
rrm <- as(mx,"realRatingMatrix")
# split train, test-known, test_unknown data
set.seed(1965)
train_test <- evaluationScheme(rrm, method=split_meth, train=split_ratio, k=split_k, given=15,goodRating=4)
rrm_train <- getData(train_test,"train")
rrm_known <- getData(train_test,"known")
rrm_unknown <- getData(train_test,"unknown")
# model
model <-Recommender(data = rrm_train,method=meth,parameter=para)
# predict Top-N recommendation list on "known" test set
pred_test_user<- predict(object = model, newdata = rrm_known, n = nr,type="topNList")
TOP_N_list <- sapply(pred_test_user@items, function(x) {colnames(as(rrm_known,"matrix"))[x]}) # top-n list for all test users
####################################
### accuracy: evaluate the recommendations on "unknown" test set
acc_ <- calcPredictionAccuracy(pred_test_user,rrm_unknown,given=15,goodRating = 4)
acc_ <- t(as.data.frame(acc_))
rownames(acc_) <- NULL
####################################
### item-space coverage: refers to the proportion of items that the recommendation system can recommend. The simplest measure is the percentage of all films that in the top-n recommendation lists divided by all potential films.
# predict top-n lists of all user
TOP_N_list <- sapply(pred_test_user@items, function(x) {colnames(as(rrm_known,"matrix"))[x]})
# unique predicted film list of all test users
uniq_film_test <- reshape2::melt(as(TOP_N_list,"matrix")) %>% rename(UserID = Var2, rank = Var1, Film_name = value)%>%distinct(Film_name) # unique film list recommended in test data
# unique film list of data
uniq_film <- as.data.frame(t(mx))
uniq_film$cnt <- rowSums(!is.na(uniq_film)) # count not NA for each film
uniq_film <- uniq_film %>% filter(cnt>0) # remove the film without any ratings
# calculate the item-space coverage
coverage <- dim(uniq_film_test)[1] / dim(uniq_film)[1] # the coverage
coverage_table <- as.data.frame(coverage)
colnames(coverage_table) <- "coverage"
######################################
### novelty for a given user: recommended movies that the user did not know about.
novelty_table <- data.frame() # an empty dataframe, will be filled with novelty values
# extract the full test set (with known and unknown ratings)
a <- (as.data.frame(as(rrm_known,'matrix')))
df_test <- as.data.frame(mx) %>% mutate(as.data.frame(mx),idx=rownames(as.data.frame(mx)))
a <- a %>% mutate(a, idx = rownames(a)) #%>% select(index)
df_test <- semi_join(df_test,a,by=c("idx"="idx")) %>% select(.,-c(idx)) # full test set
# df_i: replace the not NA values to the corresponding column name,
# df_i contains all films that user i has rated, or say the user i has known.
for(i in 1:dim(df_test)[1]){
df_i <- as.data.frame(t(df_test))#
df_i$Film <- colnames(df_test) # set index (film names) as a new column 'Film'
df_i <- df_i[,c(i,(dim(df_test)[1]+1))] %>% filter(complete.cases(.)) # list of user-i rated films
df_i$Film <- rownames(df_i) # add a new column with the same content of rownames
df_top_n <- as.data.frame(TOP_N_list[,i]) # top-n list of user-i
colnames(df_top_n) <- "Film"
df_cross <- inner_join(df_i, df_top_n, by="Film") # inner join the two dataset, we get the recommended films that user already known.
# calculate novelty
novelty <- 1 - dim(df_cross)[1]/nr # novelty value of user-i
novelty_table <- rbind(novelty_table, novelty)
}
novelty_table$UserID <- rownames(df_test)
colnames(novelty_table) <- c("novelty","UserID")
novelty_table <- novelty_table %>% select(UserID,novelty) # novelty table
metrics<- as.data.frame(acc_) %>% mutate(coverage = mean(coverage_table$coverage),novelty = mean(novelty_table$novelty), f1_score = 2*precision*recall/(precision + recall),Top_N = nr) %>% select(-c(N,TN,FN,FP,TP,TPR,FPR))
return(metrics)
}
N_lst = c(5, 10, 15, 20, 25, 30)
metrics <- data.frame()
for(nr in N_lst){metrics <- bind_rows(metrics,calc_topn_metrics(mx_reduced,'split',0.8,1,nr,"IBCF",list(normalize = "Z-score",method="Cosine",k=30)))}
metrics$models <- 'IBCF'
metrics
metrics_8_long <- metrics %>% pivot_longer(names_to = "metrics", values_to = "Value", precision:f1_score)
ggplot(metrics_8_long, aes(x=Top_N, y = Value, group = metrics, colour=metrics)) +
geom_line() +
geom_point()+
labs(title="IBCF metrics with different N values") +
theme(plot.title = element_text(hjust = 0.5))
Why use different metrics to evaluate recommender system?
model_lst = c('IBCF','UBCF','SVD','RANDOM','LIBMF')
para_lst = c(list(normalize = "Z-score",method="Cosine",k=30),list(normalize = "Z-score",method="cosine",nn=30),list(normalize = "Z-score",k=30),NULL,NULL)
metrics_91 <- data.frame()
for(i in 1:5){metrics_91 <- bind_rows(metrics_91,calc_topn_metrics(mx_reduced,'cross',0.8,10,15,model_lst[i],para_lst[i]))}
## Warning: Unknown parameter: method
## Available parameter (with default values):
## dim = 10
## costp_l2 = 0.01
## costq_l2 = 0.01
## nthread = 1
## verbose = FALSE
metrics_91$models <- c('IBCF','UBCF','SVD','RANDOM','LIBMF')
rownames(metrics_91) <- metrics_91$models
metrics_91
metrics_long <- metrics_91 %>% pivot_longer(names_to = "metrics", values_to = "Value", coverage:f1_score)
ggplot(metrics_long, aes(models, Value, fill = metrics)) +
geom_col(position = "dodge")+
labs(title="Compare 5 models with 10-fold cross-validation") +
theme(plot.title = element_text(hjust = 0.5))
N_lst = c(10,15,20,25,30)
metrics_93 <- data.frame()
for(i in 1:5){metrics_93 <- bind_rows(metrics_93,calc_topn_metrics(mx_reduced,'cross',0.8,10,N_lst[i],'IBCF',list(normalize = "Z-score",method="Cosine",k=30)))}
metrics_93$models <- 'IBCF'
metrics_93
metrics_93_long <- metrics_93 %>% pivot_longer(names_to = "metrics", values_to = "Value", precision:f1_score)
ggplot(metrics_93_long, aes(x=Top_N, y = Value, group = metrics, colour=metrics)) +
geom_line() +
geom_point()+
labs(title="Best model (IBCF) with different N values") +
theme(plot.title = element_text(hjust = 0.5))
When increase the length of recommendation list, the coverage increases from 0.22 to 0.47,
all other metrics have smaller increase or decrease (precision).
# films with the highest average ratings ( avg_ratings > 3)
df_top_avg <- as.data.frame(t(df_reduced))
df_top_avg <- df_top_avg %>% mutate(avg_rating = rowMeans(df_top_avg,na.rm=TRUE,dims=1))%>% arrange(desc(avg_rating))%>% filter(avg_rating>3) %>% select(-avg_rating)
mx_top_avg <- t(df_top_avg)
dim(mx_top_avg)
## [1] 400 566
Filter the films with average ratings > 3.
The new dataframe has 566 films.
Now I would like to tune hyperparameters of model IBCF:
# hyperparameter tuning
k_neighbor_lst <- c(15,20,30,50,100)
normalize_lst <- c('center','Z-score')
similarity_lst <- c('cosine','Pearson')
metrics_94 <- data.frame()
for(i in 1:length(normalize_lst)){
for(j in 1:length(similarity_lst)){
for(m in 1:length(k_neighbor_lst)){
metrics_new <- calc_topn_metrics(mx_top_avg,'cross',0.8,10,20,'IBCF',list(normalize = normalize_lst[i],method=similarity_lst[j],k=k_neighbor_lst[m]))
metrics_new$normalize <- normalize_lst[i]
metrics_new$similarity<- similarity_lst[j]
metrics_new$k_neighbor <- k_neighbor_lst[m]
metrics_94 <-bind_rows(metrics_94,metrics_new)}}}
metrics_94 <- metrics_94 %>% select(-Top_N)
metrics_94
metrics_94_long <- metrics_94 %>% pivot_longer(names_to = "metrics", values_to = "Value", precision:f1_score)
ggplot(metrics_94_long, aes(x=k_neighbor, y = Value, group = metrics, colour=metrics)) +
geom_line() +
geom_point() +
facet_grid(cols=vars(normalize),rows=vars(similarity)) +
labs(title="Hyperparameter tuning of best(IBCF) model") +
theme(plot.title = element_text(hjust = 0.5))
Here I tried to optimize the best (IBCF) model through three parameters normalization, similarity, k neighbours.
The two normaliazation methods center and z-score have very similiar performance on the metrics.
With Pearson similarity, the coverage is higher. When k = 15 or k = 20, the novelty values are also larger; With cosine similarity, the precision values are larger when k < 100.
k_neighbour: coverage and novelty values are largest with k = 100. Precision, recall, f1-value have largest values at around k=30 or k = 20.
Finally for IBCF model I choose the parameters “k = 30, center normalization, cosine similarity”, as with this combination IBCF model has a good balance of metrics coverage, novelty, and f1-score.
best_IBCF <- metrics_94 %>% filter(.,normalize=="center"& similarity == "cosine" & k_neighbor == 30)
best_IBCF
# use split ratio = 0.95, will get exactly 20 users in test set
neighbor_lst <- c(15,20,30,50,100) # k neighbours parameter
normalize_lst <- c('center','Z-score') # normalization methods
similarity_lst <- c('cosine','Pearson') # similarity methods for ordinal data
k_dimred <- c(10,20,30,40,50) #dimension reduction parameter for model SVD
# IBCF UBCF and SVD
metrics_IBCF <- data.frame()
metrics_UBCF <- data.frame()
metrics_SVD <- data.frame()
metrics_SVDF <- data.frame()
for(i in 1:length(normalize_lst)){
for(m in 1:length(neighbor_lst)){
metrics_new_SVD <- calc_topn_metrics(mx_top_avg,'split',0.95,1,20,'SVD',list(normalize = normalize_lst[i],k=k_dimred[m]))
metrics_new_SVD$normalize <- normalize_lst[i]
metrics_new_SVD$k_dim <- k_dimred[m]
metrics_new_SVD$model <- 'SVD'
metrics_SVD <-bind_rows(metrics_SVD,metrics_new_SVD)
metrics_new_SVDF <- calc_topn_metrics(mx_top_avg,'split',0.95,1,20,'SVDF',list(normalize = normalize_lst[i],k=k_dimred[m]))
metrics_new_SVDF$normalize <- normalize_lst[i]
metrics_new_SVDF$k_dim <- k_dimred[m]
metrics_new_SVDF$model <- 'SVDF'
metrics_SVDF <-bind_rows(metrics_SVDF,metrics_new_SVDF)
for(j in 1:length(similarity_lst)){
metrics_new_IBCF<-calc_topn_metrics(mx_top_avg,'split',0.95,1,20,'IBCF',list(normalize = normalize_lst[i],method=similarity_lst[j],k=neighbor_lst[m]))
metrics_new_IBCF$normalize <- normalize_lst[i]
metrics_new_IBCF$similarity<- similarity_lst[j]
metrics_new_IBCF$k_neighbor <- k_neighbor_lst[m]
metrics_new_IBCF$model <- 'IBCF'
metrics_IBCF <-bind_rows(metrics_IBCF,metrics_new_IBCF)
metrics_new_UBCF <-calc_topn_metrics(mx_top_avg,'split',0.95,1,20,'UBCF',list(normalize = normalize_lst[i],method=similarity_lst[j],nn=neighbor_lst[m]))
metrics_new_UBCF$normalize <- normalize_lst[i]
metrics_new_UBCF$similarity<- similarity_lst[j]
metrics_new_UBCF$nn_neighbor <- neighbor_lst[m]
metrics_new_UBCF$model <- 'UBCF'
metrics_UBCF <-bind_rows(metrics_UBCF,metrics_new_UBCF)
}}}
metrics_IBCF <- metrics_IBCF %>% select(-Top_N)
metrics_UBCF <- metrics_UBCF %>% select(-Top_N)
metrics_SVD <- metrics_SVD %>% select(-Top_N)
metrics_SVDF <- metrics_SVDF %>% select(-Top_N)
metrics_IBCF
metrics_UBCF
metrics_SVD
metrics_SVDF
metrics_IBCF_long <- metrics_IBCF %>% pivot_longer(names_to = "metrics", values_to = "Value", precision:f1_score)
ggplot(metrics_IBCF_long, aes(x=k_neighbor, y = Value, group = metrics, colour=metrics)) +
geom_line() +
geom_point() +
facet_grid(cols=vars(normalize),rows=vars(similarity)) +
labs(title="Hyperparameter tuning of IBCF model (20 test users)", x="k neighbors") +
theme(plot.title = element_text(hjust = 0.5))
metrics_UBCF_long <- metrics_UBCF %>% pivot_longer(names_to = "metrics", values_to = "Value", precision:f1_score)
ggplot(metrics_UBCF_long, aes(x=nn_neighbor, y = Value, group = metrics, colour=metrics)) +
geom_line() +
geom_point() +
facet_grid(cols=vars(normalize),rows=vars(similarity)) +
labs(title="Hyperparameter tuning of UBCF model (20 test users)",x="nn neighbors") +
theme(plot.title = element_text(hjust = 0.5))
metrics_SVD_long <- metrics_SVD %>% pivot_longer(names_to = "metrics", values_to = "Value", precision:f1_score)
g1 = ggplot(metrics_SVD_long, aes(x=k_dim, y = Value, group = metrics, colour=metrics)) +
geom_line() +
geom_point() +
facet_grid(cols=vars(normalize)) +
labs(title="Hyperparameter tuning of SVD model (20 test users)",x="k dimension reduction") +
theme(plot.title = element_text(hjust = 0.5))
metrics_SVDF_long <- metrics_SVDF %>% pivot_longer(names_to = "metrics", values_to = "Value", precision:f1_score)
g2 = ggplot(metrics_SVDF_long, aes(x=k_dim, y = Value, group = metrics, colour=metrics)) +
geom_line() +
geom_point() +
facet_grid(cols=vars(normalize)) +
labs(title="Hyperparameter tuning of SVDF model (20 test users)",x="k dimension reduction") +
theme(plot.title = element_text(hjust = 0.5))
grid.arrange(g1, g2, nrow = 2)
best_IBCF <- metrics_IBCF %>% filter(.,normalize=="center"& similarity == "cosine" & k_neighbor == 20)%>%select(c(precision,recall,coverage,novelty,f1_score,model))
best_UBCF <- metrics_UBCF %>% filter(.,normalize=="Z-score"& similarity == "cosine" & nn_neighbor == 50)%>%select(c(precision,recall,coverage,novelty,f1_score,model))
best_SVD <- metrics_SVD %>% filter(.,normalize=="center"& k_dim == 50)%>%select(c(precision,recall,coverage,novelty,f1_score,model))
best_SVDF <- metrics_SVDF %>% filter(.,normalize=="center"& k_dim == 10)%>%select(c(precision,recall,coverage,novelty,f1_score,model))
best_models <- rbind(best_IBCF,best_UBCF,best_SVD,best_SVDF)
best_models
best_models_long <- best_models %>% pivot_longer(names_to = "metrics", values_to = "Value", precision:f1_score)
ggplot(best_models_long, aes(model, Value, fill = metrics)) +
geom_col(position = "dodge")+
labs(title="Compare 4 best models (20 test users)") +
theme(plot.title = element_text(hjust = 0.5))
Top_n_genre <- function(Top_n_list,n){ # Top_n_list: top-n matrix; n: the "n" in top-n
table = data.frame()
for(i in 1:dim(Top_n_list)[2]){
df_top <- as.data.frame(Top_n_list[,i])
colnames(df_top) <- "Film"
df_film_genre_1 <- as.data.frame(mx_film_genre)
df_film_genre_1$Film <- rownames(df_film_genre_1)
df_top <- left_join(df_top,df_film_genre_1,by=("Film")) %>% select(-Film)
df_top <- df_top[-1,]
total <- sum(df_top)
df_top["ratio",] <- colSums(df_top)/total
df_top <- df_top["ratio",]
table <- rbind(table,df_top)
}
rownames(table) <- colnames(Top_n_list)
return(table)
}
# use the same split setting as in the last task to make sure that it is always the same train set, same 20 test users.
set.seed(1965)
train_test_10 <- evaluationScheme(as(mx_top_avg,"realRatingMatrix"), method="split", train=0.95, k=1, given=15,goodRating=4)
# training dataset has 380 users,test dataset has 20 users
# given=15: For each test user, 15 films per user will be used for prediction, the rest for evaluation)
rrm_reduced_train_10 <- getData(train_test_10,"train")
rrm_reduced_known_10 <- getData(train_test_10,"known")
rrm_reduced_unknown_10 <- getData(train_test_10,"unknown")
# Use the best model (selected in the last task): ICBF models with center normalization, cosine similarity, and 20 neighbors
model_IBCF <-Recommender(data = rrm_reduced_train_10,method="IBCF",parameter=list(normalize = "center",method="cosine",k=20))
# Top-15 recommendation list of
Pred_ <- predict(object = model_IBCF, newdata = rrm_reduced_known_10, n = 15,type=c("topNList"))
Top15_ <- sapply(Pred_@items, function(x) {colnames(mx_top_avg)[x]})
Top_15_recommendations <- Top_n_genre(Top15_,15) # the percentage of top-15 recommendations by genre per test user
rownames(Top_15_recommendations) <- rownames(as.data.frame(as(rrm_reduced_known_10,"matrix")))
Top_15_recommendations
# check if the table works correct. Each row should have the sum of 1
rowSums(Top_15_recommendations)
## 655 303 234 7 268 194 374 868 840 807 189 488 608 763 197 253 466 523 871 159
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
ggplot(stack(Top_15_recommendations), aes(x = reorder(ind,-values), y = values)) +
geom_boxplot() + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),plot.title = element_text(hjust = 0.5)) + labs(x=NULL,y="Proportion",title = "Genre proportion in Top-15 recommendations per user")
In this part, the goal is to analyse the genre proportion of the predicted films per user.
In the table, each row represents the genre proportion in the top-15 recommendation list of one user. This means the sum of each row should be 1.
In the short calculation under the table, we could see that all rowsum are equal to 1. This demonstrates that proportion table is correctly calculated.
The boxplot showed us the proportion of all 18 film genres. Each box includes the data of 20 test users.
We could see that Drama has the highest proportion in top-15
recommendations. And Drama genre is the only genre which is recommended
to all 20 users (there is no zero proportion), the smallest proportion
is about 0.07, the highest proportion is about 0.44.
Animation, Children’s, Documentary, Fantasy and Musical have the least proportion. And they appeared only in very few users’ top-15 recommendations.
(=Filme, welche vom Kunden die besten Bewertungen erhalten haben)
Now let’s have a look, how the genres proportion in the true data per user.
# firstly, select the twenty users which are used in the last two tasks, with the top average-ratings films.
mx_users_20 <- mx_top_avg[c(rownames(Top_15_recommendations)),]
as.data.frame(mx_users_20)
Top_film_genre <- function(mx,n){ # mx is user-item matrix, n: top-n
table = data.frame()
for(i in 1:dim(mx)[1]){ # for each user
df_top <- as.data.frame(t(mx))
df_top$Film <- rownames(df_top)
df_top <- df_top[,c(i,(dim(mx)[1]+1))]
colnames(df_top) <- c("Ratings","Film")
# filter only the films with ratings -> sort by ratings -> select the top-n rated films
df_top <- df_top %>% filter(complete.cases(.)) %>% arrange(desc(Ratings)) %>% slice(1:n)
# use the data film_genre, left join to the top-n rated films
# then will get 18 genre columns, with the value of 1 or 0 (because each film could have many genres)
df_film_genre_1 <- as.data.frame(mx_film_genre)
df_film_genre_1$Film <- rownames(df_film_genre_1)
df_left_join <- left_join(df_top,df_film_genre_1,by=("Film"))
df_top <- df_left_join[,-(1:2)]
total <- sum(df_top) # total genres involved in the top-n rated films
# generate a new row called "ratio", with the proportion of each genre
df_top["ratio",] <- colSums(df_top)/total
df_top <- df_top["ratio",] # keep only the ratio row
table <- rbind(table,df_top) # bind this row to the output table
}
rownames(table) <- rownames(mx)
return(table)
}
Top_15_films <- Top_film_genre(mx_users_20,15) # genres proportion of the top 15 films for every user
Top_15_films
rowSums(Top_15_films)
## 655 303 234 7 268 194 374 868 840 807 189 488 608 763 197 253 466 523 871 159
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
ggplot(stack(Top_15_films), aes(x = reorder(ind,-values), y = values)) +
geom_boxplot() + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),plot.title = element_text(hjust = 0.5)) + labs(x=NULL,y="Proportion",title = "Genre proportion in Top-15 favourite films per user")
The first table shows the genre proportion in the top-15 rated films per user.
The second brief calculation is the sum of each row. The result shows that all are 1, which indicates that the proportion table is correctly calculated.
The box-plot shows proportion of all the 18 genres. Each box includes the proportion of 20 test users.
# combine the predicted and true tables with labels. then convert it to a long table.
top_1 <- Top_15_films
top_1$type <- 'predicted'
top_2 <- Top_15_recommendations
top_2$type <- 'true'
top_bind <- rbind(top_1,top_2)
top_bind_long <- melt(top_bind, id = "type")
# plot
ggplot(top_bind_long, aes(x = reorder(variable,-value), y = value,color=type)) +
geom_boxplot() + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),plot.title = element_text(hjust = 0.5)) + labs(y="Proportion",x = "Genres",title = "Genre proportion of Top-15 recommendations and Top-15 favourites per user")
Drama has the highest medium proportion in both true and predicted groups.
Animation, Children’s, Documentary, Fantasy, Musical have the least proportion in both groups
Most of the genres have similiar distributions in both groups with a small differences except Western.
Western has much higher proportions in the users’ true favourite lists than in the prediction. This means some users in this group like Western, but they get less Western films recommended than expected.
evaluate the Top-recommendations comparing to Top_films by quantitative metrics MAE(mean average error,more robust than MSE), MSE(mean squared error,more sensitive to outliers than MAE), RMSE(the root mean squared error).
The more robust metric MAE has the medium of 0.034.
df_diff = Top_15_films - Top_15_recommendations
# calculate the mean absolute error
MAE_top_genre <- rowSums(abs(df_diff))/20
MSE_top_genre <- rowSums(df_diff^2)/20
RMSE_top_genre <- sqrt(rowSums(df_diff^2)/20)
quantitative_metrics <- t(rbind(MAE_top_genre,MSE_top_genre,RMSE_top_genre))
quantitative_metrics <- as.data.frame(quantitative_metrics)
colnames(quantitative_metrics) <- c("MAE","MSE","RMSE")
# plot
ggplot(stack(quantitative_metrics), aes(x = reorder(ind,values), y = values)) +
geom_boxplot() + theme(axis.text.x = element_text(angle = 0, vjust = 0.5, hjust=1),plot.title = element_text(hjust = 0.5)) + labs(x=NULL,y=NULL,title = "Quantitative metrics of Top-recommendations vs Top-films")
I will use a ranking metric MAP (Average Precision and Mean Average Precision) to qualitatively evaluate the Top-n recommendations.
It works like this:
for each user, the top-N1 genres will be labeled as relavant genres, others are non-relavant genres. For example, if choose 6, the top-6 true genres will be labeled as relavant, others as non-relavant.
set the cut-off N2: evaluate the top-N2 genres in the recommendation list.
Now go to the prediction list (the top-recommendations), sort the genres by prediction rankings (rank 1, 2, 3,….).
Scan the N2 best ranked items in prediction list. Find the prediction positions (prediction ranks) of relavant genres.
calculate the average precision of this user like this: mean ( 1/position1 + 2/position2 + 3/position3 + …).
Some users may have more relavant genres, some less.
then calculate the mean of average-precisions (MAP) from all users.
But MAP metric has the drawback that it does not consider the recommended list as an ordered list.
I will define two functions to calculate the average-precisions of all users and the average of average-precisions of all users.
define a function avg_precision, which calculates the average precision of one user. This function will be applied in the function MAP.
# calculate average precision of one user. The function will be applied in MAP function
# input: dataframe with columns of rank_true, rank_pred, genres
# N1: max number of possible relavant items
# N2: scan the top-N2 recommendations
avg_precision <- function(df,N1,N2){
count_relv = 0
sum_relv = 0
for(i in 1:N2){
row_i <- df[i,]
if (row_i["rank_true"] <= N1){# this means it is a relavant genre
count_relv = count_relv + 1
sum_relv = sum_relv + count_relv/i
}}
return(sum_relv/count_relv)}
# function MAP will calculate the average precision of all users.
# input: top_n_films, top_n_predictions, N1(how many relavant items), N2 (scan the top-N2 items in predictions)
# output: a dataframe with the average precisions of all users.
MAP <- function(top_true,top_pred,N1,N2){
avg_precision_list = NULL # the list used to collection the average-precision of all users
for(i in 1:dim(top_true)[1]){ # each time analyse one user
# transpose
top_true_i <- as.data.frame(t(top_true[i,]))
top_pred_i <- as.data.frame(t(top_pred[i,]))
# rename the column
colnames(top_true_i) <- 'proportion'
colnames(top_pred_i) <- 'proportion'
# sort by value
top_true_i <- top_true_i %>% arrange(.,desc(proportion))%>%select(-proportion)
top_pred_i <- top_pred_i %>% arrange(.,desc(proportion))%>%select(-proportion)
# generate new column "rank"
top_true_i$rank_true <- 1:18
top_pred_i$rank_pred <- 1:18
# copy rownames as a column "genres", drop rownames
top_true_i$genres <- rownames(top_true_i)
top_pred_i$genres <- rownames(top_pred_i)
rownames(top_true_i)<-NULL
rownames(top_pred_i)<-NULL
# left-join top_true_i to top_pred_i
top_join <- left_join(top_pred_i,top_true_i,by="genres")
avg_precision_i <- avg_precision(top_join,N1,N2)
avg_precision_list <- c(avg_precision_list,avg_precision_i)}
df_ap <- as.data.frame(avg_precision_list)
rownames(df_ap ) <- rownames(top_true)
colnames(df_ap) <- 'average_precision'
return(df_ap)
}
top_true <- Top_15_films
top_pred <- Top_15_recommendations
result_105 <- MAP(top_true,top_pred,5,5) %>% arrange(.,desc(average_precision))
result_105
barplot(height = result_105[,1],xlab="userID",ylab="average precision",names.arg=c(rownames(result_105)),las=2)
print(paste("mean of average-precision (MAP@5) is", colMeans(result_105)))
## [1] "mean of average-precision (MAP@5) is 0.82625"
metrics_by_film <- best_models[1,] %>% select(-model)
metrics_by_film
metrics_by_genres <- as.data.frame(mean(quantitative_metrics$MAE)) %>% mutate(.,MAP=colMeans(result_105))
colnames(metrics_by_genres) <- c("avg_MAE","MAP")
metrics_by_genres
- my recommender system use IBCF model with center normalization, cosine similarity and 20 neighbors
- Through four models IBCF UBCF SVD SVDF, with different parameter combinations, my model has the best balance of metrics coverage (0.33), novelty (0.52), f1_score (0.14), precision (0.34), and recall (0.09).
- Evaluate the recommender system by genres of the top-N recommendations and true top-N favourite films quantitatively (mean absolute error) and qualitatively (mean of average precision)
- The predicted genres proportion and true genres proportion have showed very similiar trends, and with a low mean absolute error of 0.035.
- Evaluate the genre by ranking metric mean of average precision (MAP@5), it reaches a high MAP of 0.83.
- My recommender system could give personalized recommendation based on genres with high precision.